<!DOCTYPE HTML>
<html lang="zh-Hans">
    <!-- Start book Python爬虫课程讲义 -->
    <head>
        <!-- head:start -->
        <!-- Character encoding: declared exactly once and first, so it sits inside the bytes the parser pre-scans. -->
        <meta charset="UTF-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <title>BeautifulSoup4 解析器 | Python爬虫课程讲义</title>
        <!-- Page summary for search engines and link previews; taken from the opening sentence of this chapter. -->
        <meta name="description" content="Beautiful Soup 是一个 HTML/XML 解析器，主要功能是解析和提取 HTML/XML 数据。">
        <meta name="generator" content="GitBook 2.6.7">
        <meta name="author" content="BigCat">

        <!-- Mobile and home-screen behaviour. Zooming is intentionally left enabled (no user-scalable=no). -->
        <meta name="HandheldFriendly" content="true">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta name="apple-mobile-web-app-capable" content="yes">
        <meta name="apple-mobile-web-app-status-bar-style" content="black">
        <link rel="apple-touch-icon-precomposed" sizes="152x152" href="../../gitbook/images/apple-touch-icon-precomposed-152.png">
        <link rel="shortcut icon" href="../../gitbook/images/favicon.ico" type="image/x-icon">

        <!-- Base theme stylesheet first, then one stylesheet per plugin, kept in the original load order. -->
        <link rel="stylesheet" href="../../gitbook/style.css">
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-tbfed-pagefooter/footer.css">
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-splitter/splitter.css">
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-toggle-chapters/toggle.css">
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-highlight/website.css">
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-fontsettings/website.css">

        <!-- Sequential navigation hints: the next (2.6) and previous (2.4) chapter pages. -->
        <link rel="next" href="../../file/part02/2.6.html">
        <link rel="prev" href="../../file/part02/2.4.html">
        <!-- head:end -->
    </head>
    <body>
        <!-- body:start -->
        
    <div class="book"
        data-level="2.5"
        data-chapter-title="BeautifulSoup4 解析器"
        data-filepath="file/part02/2.5.md"
        data-basepath="../.."
        data-revision="Thu Feb 09 2017 09:48:59 GMT+0800 (CST)"
        data-innerlanguage="">
    

<div class="book-summary">
    <nav role="navigation">
        <ul class="summary">
            
            
            
            

            

            
    
        <li class="chapter " data-level="0" data-path="index.html">
            
                
                    <a href="../../index.html">
                
                        <i class="fa fa-check"></i>
                        
                        传智播客Python学院爬虫课程
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1" data-path="file/part01/1.html">
            
                
                    <a href="../../file/part01/1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.</b>
                        
                        爬虫原理与数据抓取
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="1.1" data-path="file/part01/1.1.html">
            
                
                    <a href="../../file/part01/1.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.1.</b>
                        
                        (了解)通用爬虫和聚焦爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.2" data-path="file/part01/1.2.html">
            
                
                    <a href="../../file/part01/1.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.2.</b>
                        
                        (复习)HTTP/HTTPS的请求与响应
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.3" data-path="file/part01/1.3.html">
            
                
                    <a href="../../file/part01/1.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.3.</b>
                        
                        HTTP/HTTPS抓包工具-Fiddler
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.4" data-path="file/part01/1.4.html">
            
                
                    <a href="../../file/part01/1.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.4.</b>
                        
                        urllib2模块的基本使用
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.5" data-path="file/part01/1.5.html">
            
                
                    <a href="../../file/part01/1.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.5.</b>
                        
                        urllib2：GET请求和POST请求
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.6" data-path="file/part01/1.6.html">
            
                
                    <a href="../../file/part01/1.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.6.</b>
                        
                        urllib2：Handler处理器和自定义Opener
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.7" data-path="file/part01/1.7.html">
            
                
                    <a href="../../file/part01/1.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.7.</b>
                        
                        urllib2：URLError与HTTPError
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.8" data-path="file/part01/1.8.html">
            
                
                    <a href="../../file/part01/1.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.8.</b>
                        
                        Requests模块
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="2" data-path="file/part02/2.html">
            
                
                    <a href="../../file/part02/2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.</b>
                        
                        非结构化数据与结构化数据提取
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="2.1" data-path="file/part02/2.1.html">
            
                
                    <a href="../../file/part02/2.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.1.</b>
                        
                        正则表达式re模块
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.2" data-path="file/part02/2.2.html">
            
                
                    <a href="../../file/part02/2.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.2.</b>
                        
                        案例：使用正则表达式的爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.3" data-path="file/part02/2.3.html">
            
                
                    <a href="../../file/part02/2.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.3.</b>
                        
                        XPath与lxml类库
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.4" data-path="file/part02/2.4.html">
            
                
                    <a href="../../file/part02/2.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.4.</b>
                        
                        案例：使用XPath的爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter active" data-level="2.5" data-path="file/part02/2.5.html">
            
                
                    <a href="../../file/part02/2.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.5.</b>
                        
                        BeautifulSoup4 解析器
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.6" data-path="file/part02/2.6.html">
            
                
                    <a href="../../file/part02/2.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.6.</b>
                        
                        案例：使用bs4的爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.7" data-path="file/part02/2.7.html">
            
                
                    <a href="../../file/part02/2.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.7.</b>
                        
                        JSON模块与JsonPath
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.8" data-path="file/part02/2.8.html">
            
                
                    <a href="../../file/part02/2.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.8.</b>
                        
                        糗事百科案例
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.9" data-path="file/part02/2.9.html">
            
                
                    <a href="../../file/part02/2.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.9.</b>
                        
                        多线程爬虫案例
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="3" data-path="file/part03/3.html">
            
                
                    <a href="../../file/part03/3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.</b>
                        
                        动态HTML处理和机器图像识别
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="3.1" data-path="file/part03/3.1.html">
            
                
                    <a href="../../file/part03/3.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.1.</b>
                        
                        动态HTML介绍
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.2" data-path="file/part03/3.2.html">
            
                
                    <a href="../../file/part03/3.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.2.</b>
                        
                        Selenium与PhantomJS
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.3" data-path="file/part03/3.3.html">
            
                
                    <a href="../../file/part03/3.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.3.</b>
                        
                        案例一：网站模拟登录
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.4" data-path="file/part03/3.4.html">
            
                
                    <a href="../../file/part03/3.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.4.</b>
                        
                        案例二：动态页面模拟点击
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.5" data-path="file/part03/3.5.html">
            
                
                    <a href="../../file/part03/3.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.5.</b>
                        
                        案例三：执行JavaScript语句
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.6" data-path="file/part03/3.6.html">
            
                
                    <a href="../../file/part03/3.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.6.</b>
                        
                        机器视觉与Tesseract介绍
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.7" data-path="file/part03/3.7.html">
            
                
                    <a href="../../file/part03/3.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.7.</b>
                        
                        处理一些格式规范的文字
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.8" data-path="file/part03/3.8.html">
            
                
                    <a href="../../file/part03/3.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.8.</b>
                        
                        案例：尝试对验证码进行机器识别处理
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.9" data-path="file/part03/3.9.html">
            
                
                    <a href="../../file/part03/3.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.9.</b>
                        
                        机器学习：训练Tesseract
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="4" data-path="file/part04/4.html">
            
                
                    <a href="../../file/part04/4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.</b>
                        
                        Scrapy框架
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="4.1" data-path="file/part04/4.1.html">
            
                
                    <a href="../../file/part04/4.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.1.</b>
                        
                        配置安装
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.2" data-path="file/part04/4.2.html">
            
                
                    <a href="../../file/part04/4.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.2.</b>
                        
                        入门案例
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.3" data-path="file/part04/4.3.html">
            
                
                    <a href="../../file/part04/4.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.3.</b>
                        
                        Scrapy Shell
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.4" data-path="file/part04/4.4.html">
            
                
                    <a href="../../file/part04/4.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.4.</b>
                        
                        Item Pipeline
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.5" data-path="file/part04/4.5.html">
            
                
                    <a href="../../file/part04/4.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.5.</b>
                        
                        Spiders
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.6" data-path="file/part04/4.6.html">
            
                
                    <a href="../../file/part04/4.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.6.</b>
                        
                        CrawlSpiders
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.7" data-path="file/part04/4.7.html">
            
                
                    <a href="../../file/part04/4.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.7.</b>
                        
                        Request/Response
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.8" data-path="file/part04/4.8.html">
            
                
                    <a href="../../file/part04/4.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.8.</b>
                        
                        Downloader Middlewares
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.9" data-path="file/part04/4.9.html">
            
                
                    <a href="../../file/part04/4.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.9.</b>
                        
                        Settings
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="5" data-path="file/part05/5.html">
            
                
                    <a href="../../file/part05/5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.</b>
                        
                        Scrapy实战项目
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="5.1" data-path="file/part05/5.1.html">
            
                
                    <a href="../../file/part05/5.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.1.</b>
                        
                        (案例一)手机App抓包爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.2" data-path="file/part05/5.2.html">
            
                
                    <a href="../../file/part05/5.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.2.</b>
                        
                        (案例二)阳光热线问政平台爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.3" data-path="file/part05/5.3.html">
            
                
                    <a href="../../file/part05/5.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.3.</b>
                        
                        (案例三)新浪网分类资讯爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.4" data-path="file/part05/5.4.html">
            
                
                    <a href="../../file/part05/5.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.4.</b>
                        
                        (案例四)图片下载器爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.5" data-path="file/part05/5.5.html">
            
                
                    <a href="../../file/part05/5.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.5.</b>
                        
                        (案例五)将数据保存在MongoDB中
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.6" data-path="file/part05/5.6.html">
            
                
                    <a href="../../file/part05/5.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.6.</b>
                        
                        (案例六)三种scrapy模拟登陆策略
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.7" data-path="file/part05/5.7.html">
            
                
                    <a href="../../file/part05/5.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.7.</b>
                        
                        附：通过Fiddler进行手机抓包方法
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="6" data-path="file/part06/6.html">
            
                
                    <a href="../../file/part06/6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.</b>
                        
                        scrapy-redis分布式组件
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="6.1" data-path="file/part06/6.1.html">
            
                
                    <a href="../../file/part06/6.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.1.</b>
                        
                        源码分析参考：Connection
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.2" data-path="file/part06/6.2.html">
            
                
                    <a href="../../file/part06/6.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.2.</b>
                        
                        源码分析参考：Dupefilter
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.3" data-path="file/part06/6.3.html">
            
                
                    <a href="../../file/part06/6.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.3.</b>
                        
                        源码分析参考：Picklecompat
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.4" data-path="file/part06/6.4.html">
            
                
                    <a href="../../file/part06/6.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.4.</b>
                        
                        源码分析参考：Pipelines
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.5" data-path="file/part06/6.5.html">
            
                
                    <a href="../../file/part06/6.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.5.</b>
                        
                        源码分析参考：Queue
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.6" data-path="file/part06/6.6.html">
            
                
                    <a href="../../file/part06/6.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.6.</b>
                        
                        源码分析参考：Scheduler
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.7" data-path="file/part06/6.7.html">
            
                
                    <a href="../../file/part06/6.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.7.</b>
                        
                        源码分析参考：Spider
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="7" data-path="file/part07/7.html">
            
                
                    <a href="../../file/part07/7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.</b>
                        
                        scrapy-redis实战
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="7.1" data-path="file/part07/7.1.html">
            
                
                    <a href="../../file/part07/7.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.1.</b>
                        
                        源码自带项目说明
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.2" data-path="file/part07/7.2.html">
            
                
                    <a href="../../file/part07/7.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.2.</b>
                        
                        有缘网分布式爬虫项目1
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.3" data-path="file/part07/7.3.html">
            
                
                    <a href="../../file/part07/7.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.3.</b>
                        
                        有缘网分布式爬虫项目2
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.4" data-path="file/part07/7.4.html">
            
                
                    <a href="../../file/part07/7.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.4.</b>
                        
                        处理Redis里的数据
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.5" data-path="file/part07/7.5.html">
            
                
                    <a href="../../file/part07/7.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.5.</b>
                        
                        尝试改写新浪网分类资讯爬虫1
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.6" data-path="file/part07/7.6.html">
            
                
                    <a href="../../file/part07/7.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.6.</b>
                        
                        尝试改写新浪网分类资讯爬虫2
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.7" data-path="file/part07/7.7.html">
            
                
                    <a href="../../file/part07/7.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.7.</b>
                        
                        IT桔子分布式项目1
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.8" data-path="file/part07/7.8.html">
            
                
                    <a href="../../file/part07/7.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.8.</b>
                        
                        IT桔子分布式项目2
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="8" data-path="file/duanzi/duanzi.html">
            
                
                    <a href="../../file/duanzi/duanzi.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>8.</b>
                        
                        课余段子
                    </a>
            
            
        </li>
    


            
            <li class="divider"></li>
            <!-- Sidebar footer credit. "_blank" (with the underscore) opens a fresh tab on every click;
                 rel="noopener noreferrer" keeps the opened page from reaching back through window.opener. -->
            <li>
                <a href="https://www.gitbook.com" target="_blank" rel="noopener noreferrer" class="gitbook-link">
                    Published with GitBook
                </a>
            </li>
            
        </ul>
    </nav>
</div>

    <div class="book-body">
        <div class="body-inner">
            <!-- Header bar of the reading pane: holds the book title, which links back to the book root. -->
            <div class="book-header" role="navigation">
    <!-- Actions Left: slot for left-hand toolbar controls; it is empty in this static markup. -->
    

    <!-- Title: an icon styled by the "fa-circle-o-notch fa-spin" classes (presumably a Font Awesome
         loading spinner; confirm against the theme CSS), followed by a link to the book root ("../../"). -->
    <h1>
        <i class="fa fa-circle-o-notch fa-spin"></i>
        <a href="../../" >Python爬虫课程讲义</a>
    </h1>
</div>

            <div class="page-wrapper" tabindex="-1" role="main">
                <div class="page-inner">
                
                
                    <section class="normal" id="section-">
                    
                        <h1 id="css-&#x9009;&#x62E9;&#x5668;&#xFF1A;beautifulsoup4">CSS &#x9009;&#x62E9;&#x5668;&#xFF1A;BeautifulSoup4</h1>
<p>&#x548C; lxml &#x4E00;&#x6837;&#xFF0C;Beautiful Soup &#x4E5F;&#x662F;&#x4E00;&#x4E2A;HTML/XML&#x7684;&#x89E3;&#x6790;&#x5668;&#xFF0C;&#x4E3B;&#x8981;&#x7684;&#x529F;&#x80FD;&#x4E5F;&#x662F;&#x5982;&#x4F55;&#x89E3;&#x6790;&#x548C;&#x63D0;&#x53D6; HTML/XML &#x6570;&#x636E;&#x3002;</p>
<blockquote>
<p>lxml &#x53EA;&#x4F1A;&#x5C40;&#x90E8;&#x904D;&#x5386;&#xFF0C;&#x800C;Beautiful Soup &#x662F;&#x57FA;&#x4E8E;HTML DOM&#x7684;&#xFF0C;&#x4F1A;&#x8F7D;&#x5165;&#x6574;&#x4E2A;&#x6587;&#x6863;&#xFF0C;&#x89E3;&#x6790;&#x6574;&#x4E2A;DOM&#x6811;&#xFF0C;&#x56E0;&#x6B64;&#x65F6;&#x95F4;&#x548C;&#x5185;&#x5B58;&#x5F00;&#x9500;&#x90FD;&#x4F1A;&#x5927;&#x5F88;&#x591A;&#xFF0C;&#x6240;&#x4EE5;&#x6027;&#x80FD;&#x8981;&#x4F4E;&#x4E8E;lxml&#x3002;</p>
<p>BeautifulSoup &#x7528;&#x6765;&#x89E3;&#x6790; HTML &#x6BD4;&#x8F83;&#x7B80;&#x5355;&#xFF0C;API&#x975E;&#x5E38;&#x4EBA;&#x6027;&#x5316;&#xFF0C;&#x652F;&#x6301;<a href="http://www.w3school.com.cn/cssref/css_selectors.asp" target="_blank">CSS&#x9009;&#x62E9;&#x5668;</a>&#x3001;Python&#x6807;&#x51C6;&#x5E93;&#x4E2D;&#x7684;HTML&#x89E3;&#x6790;&#x5668;&#xFF0C;&#x4E5F;&#x652F;&#x6301; lxml &#x7684; XML&#x89E3;&#x6790;&#x5668;&#x3002;</p>
<p>Beautiful Soup 3 &#x76EE;&#x524D;&#x5DF2;&#x7ECF;&#x505C;&#x6B62;&#x5F00;&#x53D1;&#xFF0C;&#x63A8;&#x8350;&#x73B0;&#x5728;&#x7684;&#x9879;&#x76EE;&#x4F7F;&#x7528;Beautiful Soup 4&#x3002;&#x4F7F;&#x7528; pip &#x5B89;&#x88C5;&#x5373;&#x53EF;&#xFF1A;<code>pip install beautifulsoup4</code></p>
<p>&#x5B98;&#x65B9;&#x6587;&#x6863;&#xFF1A;<a href="http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/" target="_blank">http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0</a></p>
</blockquote>
<table>
<thead>
<tr>
<th style="text-align:center">&#x6293;&#x53D6;&#x5DE5;&#x5177;</th>
<th style="text-align:center">&#x901F;&#x5EA6;</th>
<th style="text-align:center">&#x4F7F;&#x7528;&#x96BE;&#x5EA6;</th>
<th style="text-align:center">&#x5B89;&#x88C5;&#x96BE;&#x5EA6;</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:center">&#x6B63;&#x5219;</td>
<td style="text-align:center">&#x6700;&#x5FEB;</td>
<td style="text-align:center">&#x56F0;&#x96BE;</td>
<td style="text-align:center">&#x65E0;&#xFF08;&#x5185;&#x7F6E;&#xFF09;</td>
</tr>
<tr>
<td style="text-align:center">BeautifulSoup</td>
<td style="text-align:center">&#x6162;</td>
<td style="text-align:center">&#x6700;&#x7B80;&#x5355;</td>
<td style="text-align:center">&#x7B80;&#x5355;</td>
</tr>
<tr>
<td style="text-align:center">lxml</td>
<td style="text-align:center">&#x5FEB;</td>
<td style="text-align:center">&#x7B80;&#x5355;</td>
<td style="text-align:center">&#x4E00;&#x822C;</td>
</tr>
</tbody>
</table>
<hr>
<h2 id="&#x793A;&#x4F8B;&#xFF1A;">&#x793A;&#x4F8B;&#xFF1A;</h2>
<p>&#x9996;&#x5148;&#x5FC5;&#x987B;&#x8981;&#x5BFC;&#x5165; bs4 &#x5E93;</p>
<pre><code class="lang-python"><span class="hljs-comment"># beautifulsoup4_test.py</span>

<span class="hljs-keyword">from</span> bs4 <span class="hljs-keyword">import</span> BeautifulSoup

html = <span class="hljs-string">&quot;&quot;&quot;
&lt;html&gt;&lt;head&gt;&lt;title&gt;The Dormouse&apos;s story&lt;/title&gt;&lt;/head&gt;
&lt;body&gt;
&lt;p class=&quot;title&quot; name=&quot;dromouse&quot;&gt;&lt;b&gt;The Dormouse&apos;s story&lt;/b&gt;&lt;/p&gt;
&lt;p class=&quot;story&quot;&gt;Once upon a time there were three little sisters; and their names were
&lt;a href=&quot;http://example.com/elsie&quot; class=&quot;sister&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;,
&lt;a href=&quot;http://example.com/lacie&quot; class=&quot;sister&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt; and
&lt;a href=&quot;http://example.com/tillie&quot; class=&quot;sister&quot; id=&quot;link3&quot;&gt;Tillie&lt;/a&gt;;
and they lived at the bottom of a well.&lt;/p&gt;
&lt;p class=&quot;story&quot;&gt;...&lt;/p&gt;
&quot;&quot;&quot;</span>

<span class="hljs-comment">#&#x521B;&#x5EFA; Beautiful Soup &#x5BF9;&#x8C61;</span>
soup = BeautifulSoup(html)

<span class="hljs-comment">#&#x6253;&#x5F00;&#x672C;&#x5730; HTML &#x6587;&#x4EF6;&#x7684;&#x65B9;&#x5F0F;&#x6765;&#x521B;&#x5EFA;&#x5BF9;&#x8C61;</span>
<span class="hljs-comment">#soup = BeautifulSoup(open(&apos;index.html&apos;))</span>

<span class="hljs-comment">#&#x683C;&#x5F0F;&#x5316;&#x8F93;&#x51FA; soup &#x5BF9;&#x8C61;&#x7684;&#x5185;&#x5BB9;</span>
<span class="hljs-keyword">print</span> soup.prettify()
</code></pre>
<p>&#x8FD0;&#x884C;&#x7ED3;&#x679C;&#xFF1A;</p>
<pre><code>&lt;html&gt;
 &lt;head&gt;
  &lt;title&gt;
   The Dormouse&apos;s story
  &lt;/title&gt;
 &lt;/head&gt;
 &lt;body&gt;
  &lt;p class=&quot;title&quot; name=&quot;dromouse&quot;&gt;
   &lt;b&gt;
    The Dormouse&apos;s story
   &lt;/b&gt;
  &lt;/p&gt;
  &lt;p class=&quot;story&quot;&gt;
   Once upon a time there were three little sisters; and their names were
   &lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;
    &lt;!-- Elsie --&gt;
   &lt;/a&gt;
   ,
   &lt;a class=&quot;sister&quot; href=&quot;http://example.com/lacie&quot; id=&quot;link2&quot;&gt;
    Lacie
   &lt;/a&gt;
   and
   &lt;a class=&quot;sister&quot; href=&quot;http://example.com/tillie&quot; id=&quot;link3&quot;&gt;
    Tillie
   &lt;/a&gt;
   ;
and they lived at the bottom of a well.
  &lt;/p&gt;
  &lt;p class=&quot;story&quot;&gt;
   ...
  &lt;/p&gt;
 &lt;/body&gt;
&lt;/html&gt;
</code></pre><blockquote>
<ul>
<li><p>&#x5982;&#x679C;&#x6211;&#x4EEC;&#x5728; IPython2 &#x4E0B;&#x6267;&#x884C;&#xFF0C;&#x4F1A;&#x770B;&#x5230;&#x8FD9;&#x6837;&#x4E00;&#x6BB5;&#x8B66;&#x544A;&#xFF1A;
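<img src="../images/beautiful_lxml.png" alt="BeautifulSoup &#x672A;&#x6307;&#x5B9A;&#x89E3;&#x6790;&#x5668;&#x7684;&#x8B66;&#x544A;&#x622A;&#x56FE;"></p>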
</li>
<li><p>&#x610F;&#x601D;&#x662F;&#xFF0C;&#x7531;&#x4E8E;&#x6211;&#x4EEC;&#x6CA1;&#x6709;&#x663E;&#x5F0F;&#x5730;&#x6307;&#x5B9A;&#x89E3;&#x6790;&#x5668;&#xFF0C;&#x6240;&#x4EE5;&#x9ED8;&#x8BA4;&#x4F7F;&#x7528;&#x8FD9;&#x4E2A;&#x7CFB;&#x7EDF;&#x7684;&#x6700;&#x4F73;&#x53EF;&#x7528;HTML&#x89E3;&#x6790;&#x5668;(&#x201C;lxml&#x201D;)&#x3002;&#x5982;&#x679C;&#x4F60;&#x5728;&#x53E6;&#x4E00;&#x4E2A;&#x7CFB;&#x7EDF;&#x4E2D;&#x8FD0;&#x884C;&#x8FD9;&#x6BB5;&#x4EE3;&#x7801;&#xFF0C;&#x6216;&#x8005;&#x5728;&#x4E0D;&#x540C;&#x7684;&#x865A;&#x62DF;&#x73AF;&#x5883;&#x4E2D;&#xFF0C;&#x53EF;&#x80FD;&#x4F1A;&#x4F7F;&#x7528;&#x4E0D;&#x540C;&#x7684;&#x89E3;&#x6790;&#x5668;&#xFF0C;&#x9020;&#x6210;&#x884C;&#x4E3A;&#x4E0D;&#x540C;&#x3002;</p>
</li>
<li>&#x4F46;&#x662F;&#x6211;&#x4EEC;&#x53EF;&#x4EE5;&#x901A;&#x8FC7;<code>soup = BeautifulSoup(html, &quot;lxml&quot;)</code>&#x65B9;&#x5F0F;&#x6307;&#x5B9A;lxml&#x89E3;&#x6790;&#x5668;&#x3002;</li>
</ul>
</blockquote>
<h2 id="&#x56DB;&#x5927;&#x5BF9;&#x8C61;&#x79CD;&#x7C7B;">&#x56DB;&#x5927;&#x5BF9;&#x8C61;&#x79CD;&#x7C7B;</h2>
<p>Beautiful Soup&#x5C06;&#x590D;&#x6742;HTML&#x6587;&#x6863;&#x8F6C;&#x6362;&#x6210;&#x4E00;&#x4E2A;&#x590D;&#x6742;&#x7684;&#x6811;&#x5F62;&#x7ED3;&#x6784;,&#x6BCF;&#x4E2A;&#x8282;&#x70B9;&#x90FD;&#x662F;Python&#x5BF9;&#x8C61;,&#x6240;&#x6709;&#x5BF9;&#x8C61;&#x53EF;&#x4EE5;&#x5F52;&#x7EB3;&#x4E3A;4&#x79CD;:</p>
<ul>
<li>Tag</li>
<li>NavigableString</li>
<li>BeautifulSoup</li>
<li>Comment</li>
</ul>
<h3 id="1-tag">1. Tag</h3>
<p>Tag &#x901A;&#x4FD7;&#x70B9;&#x8BB2;&#x5C31;&#x662F; HTML &#x4E2D;&#x7684;&#x4E00;&#x4E2A;&#x4E2A;&#x6807;&#x7B7E;&#xFF0C;&#x4F8B;&#x5982;&#xFF1A;</p>
<pre><code class="lang-html"><span class="hljs-tag">&lt;<span class="hljs-title">head</span>&gt;</span><span class="hljs-tag">&lt;<span class="hljs-title">title</span>&gt;</span>The Dormouse&apos;s story<span class="hljs-tag">&lt;/<span class="hljs-title">title</span>&gt;</span><span class="hljs-tag">&lt;/<span class="hljs-title">head</span>&gt;</span>
<span class="hljs-tag">&lt;<span class="hljs-title">a</span> <span class="hljs-attribute">class</span>=<span class="hljs-value">&quot;sister&quot;</span> <span class="hljs-attribute">href</span>=<span class="hljs-value">&quot;http://example.com/elsie&quot;</span> <span class="hljs-attribute">id</span>=<span class="hljs-value">&quot;link1&quot;</span>&gt;</span><span class="hljs-comment">&lt;!-- Elsie --&gt;</span><span class="hljs-tag">&lt;/<span class="hljs-title">a</span>&gt;</span>
<span class="hljs-tag">&lt;<span class="hljs-title">p</span> <span class="hljs-attribute">class</span>=<span class="hljs-value">&quot;title&quot;</span> <span class="hljs-attribute">name</span>=<span class="hljs-value">&quot;dromouse&quot;</span>&gt;</span><span class="hljs-tag">&lt;<span class="hljs-title">b</span>&gt;</span>The Dormouse&apos;s story<span class="hljs-tag">&lt;/<span class="hljs-title">b</span>&gt;</span><span class="hljs-tag">&lt;/<span class="hljs-title">p</span>&gt;</span>
</code></pre>
<p>&#x4E0A;&#x9762;&#x7684; <code>title</code> <code>head</code>  <code>a</code> <code>p</code>&#x7B49;&#x7B49; HTML &#x6807;&#x7B7E;&#x52A0;&#x4E0A;&#x91CC;&#x9762;&#x5305;&#x62EC;&#x7684;&#x5185;&#x5BB9;&#x5C31;&#x662F; Tag&#xFF0C;&#x90A3;&#x4E48;&#x8BD5;&#x7740;&#x4F7F;&#x7528; Beautiful Soup &#x6765;&#x83B7;&#x53D6; Tags:</p>
<pre><code class="lang-python"><span class="hljs-keyword">from</span> bs4 <span class="hljs-keyword">import</span> BeautifulSoup

html = <span class="hljs-string">&quot;&quot;&quot;
&lt;html&gt;&lt;head&gt;&lt;title&gt;The Dormouse&apos;s story&lt;/title&gt;&lt;/head&gt;
&lt;body&gt;
&lt;p class=&quot;title&quot; name=&quot;dromouse&quot;&gt;&lt;b&gt;The Dormouse&apos;s story&lt;/b&gt;&lt;/p&gt;
&lt;p class=&quot;story&quot;&gt;Once upon a time there were three little sisters; and their names were
&lt;a href=&quot;http://example.com/elsie&quot; class=&quot;sister&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;,
&lt;a href=&quot;http://example.com/lacie&quot; class=&quot;sister&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt; and
&lt;a href=&quot;http://example.com/tillie&quot; class=&quot;sister&quot; id=&quot;link3&quot;&gt;Tillie&lt;/a&gt;;
and they lived at the bottom of a well.&lt;/p&gt;
&lt;p class=&quot;story&quot;&gt;...&lt;/p&gt;
&quot;&quot;&quot;</span>

<span class="hljs-comment">#&#x521B;&#x5EFA; Beautiful Soup &#x5BF9;&#x8C61;</span>
soup = BeautifulSoup(html)


<span class="hljs-keyword">print</span> soup.title
<span class="hljs-comment"># &lt;title&gt;The Dormouse&apos;s story&lt;/title&gt;</span>

<span class="hljs-keyword">print</span> soup.head
<span class="hljs-comment"># &lt;head&gt;&lt;title&gt;The Dormouse&apos;s story&lt;/title&gt;&lt;/head&gt;</span>

<span class="hljs-keyword">print</span> soup.a
<span class="hljs-comment"># &lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;</span>

<span class="hljs-keyword">print</span> soup.p
<span class="hljs-comment"># &lt;p class=&quot;title&quot; name=&quot;dromouse&quot;&gt;&lt;b&gt;The Dormouse&apos;s story&lt;/b&gt;&lt;/p&gt;</span>

<span class="hljs-keyword">print</span> type(soup.p)
<span class="hljs-comment"># &lt;class &apos;bs4.element.Tag&apos;&gt;</span>
</code></pre>
<p>&#x6211;&#x4EEC;&#x53EF;&#x4EE5;&#x5229;&#x7528; soup &#x52A0;&#x6807;&#x7B7E;&#x540D;&#x8F7B;&#x677E;&#x5730;&#x83B7;&#x53D6;&#x8FD9;&#x4E9B;&#x6807;&#x7B7E;&#x7684;&#x5185;&#x5BB9;&#xFF0C;&#x8FD9;&#x4E9B;&#x5BF9;&#x8C61;&#x7684;&#x7C7B;&#x578B;&#x662F;<code>bs4.element.Tag</code>&#x3002;&#x4F46;&#x662F;&#x6CE8;&#x610F;&#xFF0C;&#x5B83;&#x67E5;&#x627E;&#x7684;&#x662F;&#x5728;&#x6240;&#x6709;&#x5185;&#x5BB9;&#x4E2D;&#x7684;&#x7B2C;&#x4E00;&#x4E2A;&#x7B26;&#x5408;&#x8981;&#x6C42;&#x7684;&#x6807;&#x7B7E;&#x3002;&#x5982;&#x679C;&#x8981;&#x67E5;&#x8BE2;&#x6240;&#x6709;&#x7684;&#x6807;&#x7B7E;&#xFF0C;&#x540E;&#x9762;&#x4F1A;&#x8FDB;&#x884C;&#x4ECB;&#x7ECD;&#x3002;</p>
<h5 id="&#x5BF9;&#x4E8E;-tag&#xFF0C;&#x5B83;&#x6709;&#x4E24;&#x4E2A;&#x91CD;&#x8981;&#x7684;&#x5C5E;&#x6027;&#xFF0C;&#x662F;-name-&#x548C;-attrs">&#x5BF9;&#x4E8E; Tag&#xFF0C;&#x5B83;&#x6709;&#x4E24;&#x4E2A;&#x91CD;&#x8981;&#x7684;&#x5C5E;&#x6027;&#xFF0C;&#x662F; name &#x548C; attrs</h5>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.name
<span class="hljs-comment"># [document] #soup &#x5BF9;&#x8C61;&#x672C;&#x8EAB;&#x6BD4;&#x8F83;&#x7279;&#x6B8A;&#xFF0C;&#x5B83;&#x7684; name &#x5373;&#x4E3A; [document]</span>

<span class="hljs-keyword">print</span> soup.head.name
<span class="hljs-comment"># head #&#x5BF9;&#x4E8E;&#x5176;&#x4ED6;&#x5185;&#x90E8;&#x6807;&#x7B7E;&#xFF0C;&#x8F93;&#x51FA;&#x7684;&#x503C;&#x4FBF;&#x4E3A;&#x6807;&#x7B7E;&#x672C;&#x8EAB;&#x7684;&#x540D;&#x79F0;</span>

<span class="hljs-keyword">print</span> soup.p.attrs
<span class="hljs-comment"># {&apos;class&apos;: [&apos;title&apos;], &apos;name&apos;: &apos;dromouse&apos;}</span>
<span class="hljs-comment"># &#x5728;&#x8FD9;&#x91CC;&#xFF0C;&#x6211;&#x4EEC;&#x628A; p &#x6807;&#x7B7E;&#x7684;&#x6240;&#x6709;&#x5C5E;&#x6027;&#x6253;&#x5370;&#x8F93;&#x51FA;&#x4E86;&#x51FA;&#x6765;&#xFF0C;&#x5F97;&#x5230;&#x7684;&#x7C7B;&#x578B;&#x662F;&#x4E00;&#x4E2A;&#x5B57;&#x5178;&#x3002;</span>

<span class="hljs-keyword">print</span> soup.p[<span class="hljs-string">&apos;class&apos;</span>] <span class="hljs-comment"># soup.p.get(&apos;class&apos;)</span>
<span class="hljs-comment"># [&apos;title&apos;] #&#x8FD8;&#x53EF;&#x4EE5;&#x5229;&#x7528;get&#x65B9;&#x6CD5;&#xFF0C;&#x4F20;&#x5165;&#x5C5E;&#x6027;&#x7684;&#x540D;&#x79F0;&#xFF0C;&#x4E8C;&#x8005;&#x662F;&#x7B49;&#x4EF7;&#x7684;</span>

soup.p[<span class="hljs-string">&apos;class&apos;</span>] = <span class="hljs-string">&quot;newClass&quot;</span>
<span class="hljs-keyword">print</span> soup.p <span class="hljs-comment"># &#x53EF;&#x4EE5;&#x5BF9;&#x8FD9;&#x4E9B;&#x5C5E;&#x6027;&#x548C;&#x5185;&#x5BB9;&#x7B49;&#x7B49;&#x8FDB;&#x884C;&#x4FEE;&#x6539;</span>
<span class="hljs-comment"># &lt;p class=&quot;newClass&quot; name=&quot;dromouse&quot;&gt;&lt;b&gt;The Dormouse&apos;s story&lt;/b&gt;&lt;/p&gt;</span>

<span class="hljs-keyword">del</span> soup.p[<span class="hljs-string">&apos;class&apos;</span>] <span class="hljs-comment"># &#x8FD8;&#x53EF;&#x4EE5;&#x5BF9;&#x8FD9;&#x4E2A;&#x5C5E;&#x6027;&#x8FDB;&#x884C;&#x5220;&#x9664;</span>
<span class="hljs-keyword">print</span> soup.p
<span class="hljs-comment"># &lt;p name=&quot;dromouse&quot;&gt;&lt;b&gt;The Dormouse&apos;s story&lt;/b&gt;&lt;/p&gt;</span>
</code></pre>
<h3 id="2-navigablestring">2. NavigableString</h3>
<p>&#x65E2;&#x7136;&#x6211;&#x4EEC;&#x5DF2;&#x7ECF;&#x5F97;&#x5230;&#x4E86;&#x6807;&#x7B7E;&#x7684;&#x5185;&#x5BB9;&#xFF0C;&#x90A3;&#x4E48;&#x95EE;&#x9898;&#x6765;&#x4E86;&#xFF0C;&#x6211;&#x4EEC;&#x8981;&#x60F3;&#x83B7;&#x53D6;&#x6807;&#x7B7E;&#x5185;&#x90E8;&#x7684;&#x6587;&#x5B57;&#x600E;&#x4E48;&#x529E;&#x5462;&#xFF1F;&#x5F88;&#x7B80;&#x5355;&#xFF0C;&#x7528; .string &#x5373;&#x53EF;&#xFF0C;&#x4F8B;&#x5982;</p>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.p.string
<span class="hljs-comment"># The Dormouse&apos;s story</span>

<span class="hljs-keyword">print</span> type(soup.p.string)
<span class="hljs-comment"># &lt;class &apos;bs4.element.NavigableString&apos;&gt;</span>
</code></pre>
<h3 id="3-beautifulsoup">3. BeautifulSoup</h3>
<p>BeautifulSoup &#x5BF9;&#x8C61;&#x8868;&#x793A;&#x7684;&#x662F;&#x4E00;&#x4E2A;&#x6587;&#x6863;&#x7684;&#x5185;&#x5BB9;&#x3002;&#x5927;&#x90E8;&#x5206;&#x65F6;&#x5019;,&#x53EF;&#x4EE5;&#x628A;&#x5B83;&#x5F53;&#x4F5C; Tag &#x5BF9;&#x8C61;&#xFF0C;&#x662F;&#x4E00;&#x4E2A;&#x7279;&#x6B8A;&#x7684; Tag&#xFF0C;&#x6211;&#x4EEC;&#x53EF;&#x4EE5;&#x5206;&#x522B;&#x83B7;&#x53D6;&#x5B83;&#x7684;&#x7C7B;&#x578B;&#xFF0C;&#x540D;&#x79F0;&#xFF0C;&#x4EE5;&#x53CA;&#x5C5E;&#x6027;&#x6765;&#x611F;&#x53D7;&#x4E00;&#x4E0B;</p>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> type(soup.name)
<span class="hljs-comment"># &lt;type &apos;unicode&apos;&gt;</span>

<span class="hljs-keyword">print</span> soup.name 
<span class="hljs-comment"># [document]</span>

<span class="hljs-keyword">print</span> soup.attrs <span class="hljs-comment"># &#x6587;&#x6863;&#x672C;&#x8EAB;&#x7684;&#x5C5E;&#x6027;&#x4E3A;&#x7A7A;</span>
<span class="hljs-comment"># {}</span>
</code></pre>
<h3 id="4-comment">4. Comment</h3>
<p>Comment &#x5BF9;&#x8C61;&#x662F;&#x4E00;&#x4E2A;&#x7279;&#x6B8A;&#x7C7B;&#x578B;&#x7684; NavigableString &#x5BF9;&#x8C61;&#xFF0C;&#x5176;&#x8F93;&#x51FA;&#x7684;&#x5185;&#x5BB9;&#x4E0D;&#x5305;&#x62EC;&#x6CE8;&#x91CA;&#x7B26;&#x53F7;&#x3002;</p>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.a
<span class="hljs-comment"># &lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;</span>

<span class="hljs-keyword">print</span> soup.a.string
<span class="hljs-comment"># Elsie </span>

<span class="hljs-keyword">print</span> type(soup.a.string)
<span class="hljs-comment"># &lt;class &apos;bs4.element.Comment&apos;&gt;</span>
</code></pre>
<p>a &#x6807;&#x7B7E;&#x91CC;&#x7684;&#x5185;&#x5BB9;&#x5B9E;&#x9645;&#x4E0A;&#x662F;&#x6CE8;&#x91CA;&#xFF0C;&#x4F46;&#x662F;&#x5982;&#x679C;&#x6211;&#x4EEC;&#x5229;&#x7528; .string &#x6765;&#x8F93;&#x51FA;&#x5B83;&#x7684;&#x5185;&#x5BB9;&#x65F6;&#xFF0C;&#x6CE8;&#x91CA;&#x7B26;&#x53F7;&#x5DF2;&#x7ECF;&#x53BB;&#x6389;&#x4E86;&#x3002;</p>
<h2 id="&#x904D;&#x5386;&#x6587;&#x6863;&#x6811;">&#x904D;&#x5386;&#x6587;&#x6863;&#x6811;</h2>
<h3 id="1-&#x76F4;&#x63A5;&#x5B50;&#x8282;&#x70B9;-&#xFF1A;contents-children--&#x5C5E;&#x6027;">1. &#x76F4;&#x63A5;&#x5B50;&#x8282;&#x70B9; &#xFF1A;<code>.contents</code> <code>.children</code>  &#x5C5E;&#x6027;</h3>
<h4 id="content">.contents</h4>
<p>tag &#x7684; .contents &#x5C5E;&#x6027;&#x53EF;&#x4EE5;&#x5C06;tag&#x7684;&#x5B50;&#x8282;&#x70B9;&#x4EE5;&#x5217;&#x8868;&#x7684;&#x65B9;&#x5F0F;&#x8F93;&#x51FA;</p>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.head.contents 
<span class="hljs-comment">#[&lt;title&gt;The Dormouse&apos;s story&lt;/title&gt;]</span>
</code></pre>
<p>&#x8F93;&#x51FA;&#x65B9;&#x5F0F;&#x4E3A;&#x5217;&#x8868;&#xFF0C;&#x6211;&#x4EEC;&#x53EF;&#x4EE5;&#x7528;&#x5217;&#x8868;&#x7D22;&#x5F15;&#x6765;&#x83B7;&#x53D6;&#x5B83;&#x7684;&#x67D0;&#x4E00;&#x4E2A;&#x5143;&#x7D20;</p>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.head.contents[<span class="hljs-number">0</span>]
<span class="hljs-comment">#&lt;title&gt;The Dormouse&apos;s story&lt;/title&gt;</span>
</code></pre>
<h4 id="children">.children</h4>
<p>&#x5B83;&#x8FD4;&#x56DE;&#x7684;&#x4E0D;&#x662F;&#x4E00;&#x4E2A; list&#xFF0C;&#x4E0D;&#x8FC7;&#x6211;&#x4EEC;&#x53EF;&#x4EE5;&#x901A;&#x8FC7;&#x904D;&#x5386;&#x83B7;&#x53D6;&#x6240;&#x6709;&#x5B50;&#x8282;&#x70B9;&#x3002;</p>
<p>&#x6211;&#x4EEC;&#x6253;&#x5370;&#x8F93;&#x51FA; .children &#x770B;&#x4E00;&#x4E0B;&#xFF0C;&#x53EF;&#x4EE5;&#x53D1;&#x73B0;&#x5B83;&#x662F;&#x4E00;&#x4E2A; list &#x8FED;&#x4EE3;&#x5668;&#x5BF9;&#x8C61;</p>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.head.children
<span class="hljs-comment">#&lt;listiterator object at 0x7f71457f5710&gt;</span>

<span class="hljs-keyword">for</span> child <span class="hljs-keyword">in</span>  soup.body.children:
    <span class="hljs-keyword">print</span> child
</code></pre>
<p>&#x7ED3;&#x679C;:</p>
<pre><code class="lang-html"><span class="hljs-tag">&lt;<span class="hljs-title">p</span> <span class="hljs-attribute">class</span>=<span class="hljs-value">&quot;title&quot;</span> <span class="hljs-attribute">name</span>=<span class="hljs-value">&quot;dromouse&quot;</span>&gt;</span><span class="hljs-tag">&lt;<span class="hljs-title">b</span>&gt;</span>The Dormouse&apos;s story<span class="hljs-tag">&lt;/<span class="hljs-title">b</span>&gt;</span><span class="hljs-tag">&lt;/<span class="hljs-title">p</span>&gt;</span>

<span class="hljs-tag">&lt;<span class="hljs-title">p</span> <span class="hljs-attribute">class</span>=<span class="hljs-value">&quot;story&quot;</span>&gt;</span>Once upon a time there were three little sisters; and their names were
<span class="hljs-tag">&lt;<span class="hljs-title">a</span> <span class="hljs-attribute">class</span>=<span class="hljs-value">&quot;sister&quot;</span> <span class="hljs-attribute">href</span>=<span class="hljs-value">&quot;http://example.com/elsie&quot;</span> <span class="hljs-attribute">id</span>=<span class="hljs-value">&quot;link1&quot;</span>&gt;</span><span class="hljs-comment">&lt;!-- Elsie --&gt;</span><span class="hljs-tag">&lt;/<span class="hljs-title">a</span>&gt;</span>,
<span class="hljs-tag">&lt;<span class="hljs-title">a</span> <span class="hljs-attribute">class</span>=<span class="hljs-value">&quot;sister&quot;</span> <span class="hljs-attribute">href</span>=<span class="hljs-value">&quot;http://example.com/lacie&quot;</span> <span class="hljs-attribute">id</span>=<span class="hljs-value">&quot;link2&quot;</span>&gt;</span>Lacie<span class="hljs-tag">&lt;/<span class="hljs-title">a</span>&gt;</span> and
<span class="hljs-tag">&lt;<span class="hljs-title">a</span> <span class="hljs-attribute">class</span>=<span class="hljs-value">&quot;sister&quot;</span> <span class="hljs-attribute">href</span>=<span class="hljs-value">&quot;http://example.com/tillie&quot;</span> <span class="hljs-attribute">id</span>=<span class="hljs-value">&quot;link3&quot;</span>&gt;</span>Tillie<span class="hljs-tag">&lt;/<span class="hljs-title">a</span>&gt;</span>;
and they lived at the bottom of a well.<span class="hljs-tag">&lt;/<span class="hljs-title">p</span>&gt;</span>

<span class="hljs-tag">&lt;<span class="hljs-title">p</span> <span class="hljs-attribute">class</span>=<span class="hljs-value">&quot;story&quot;</span>&gt;</span>...<span class="hljs-tag">&lt;/<span class="hljs-title">p</span>&gt;</span>
</code></pre>
<h3 id="2-&#x6240;&#x6709;&#x5B50;&#x5B59;&#x8282;&#x70B9;-descendants--&#x5C5E;&#x6027;">2. &#x6240;&#x6709;&#x5B50;&#x5B59;&#x8282;&#x70B9;: <code>.descendants</code>  &#x5C5E;&#x6027;</h3>
<p>.contents &#x548C; .children &#x5C5E;&#x6027;&#x4EC5;&#x5305;&#x542B;tag&#x7684;&#x76F4;&#x63A5;&#x5B50;&#x8282;&#x70B9;&#xFF0C;.descendants &#x5C5E;&#x6027;&#x53EF;&#x4EE5;&#x5BF9;&#x6240;&#x6709;tag&#x7684;&#x5B50;&#x5B59;&#x8282;&#x70B9;&#x8FDB;&#x884C;&#x9012;&#x5F52;&#x5FAA;&#x73AF;&#xFF0C;&#x548C; children&#x7C7B;&#x4F3C;&#xFF0C;&#x6211;&#x4EEC;&#x4E5F;&#x9700;&#x8981;&#x904D;&#x5386;&#x83B7;&#x53D6;&#x5176;&#x4E2D;&#x7684;&#x5185;&#x5BB9;&#x3002;</p>
<pre><code class="lang-python"><span class="hljs-keyword">for</span> child <span class="hljs-keyword">in</span> soup.descendants:
    <span class="hljs-keyword">print</span> child
</code></pre>
<p>&#x8FD0;&#x884C;&#x7ED3;&#x679C;&#xFF1A;</p>
<pre><code class="lang-python">&lt;html&gt;&lt;head&gt;&lt;title&gt;The Dormouse&apos;s story&lt;/title&gt;&lt;/head&gt;
&lt;body&gt;
&lt;p class=&quot;title&quot; name=&quot;dromouse&quot;&gt;&lt;b&gt;The Dormouse&apos;s story&lt;/b&gt;&lt;/p&gt;
&lt;p class=&quot;story&quot;&gt;Once upon a time there were three little sisters; and their names were
&lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;,
&lt;a class=&quot;sister&quot; href=&quot;http://example.com/lacie&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt; and
&lt;a class=&quot;sister&quot; href=&quot;http://example.com/tillie&quot; id=&quot;link3&quot;&gt;Tillie&lt;/a&gt;;
and they lived at the bottom of a well.&lt;/p&gt;
&lt;p class=&quot;story&quot;&gt;...&lt;/p&gt;
&lt;/body&gt;&lt;/html&gt;
&lt;head&gt;&lt;title&gt;The Dormouse&apos;s story&lt;/title&gt;&lt;/head&gt;
&lt;title&gt;The Dormouse&apos;s story&lt;/title&gt;
The Dormouse&apos;s story


&lt;body&gt;
&lt;p class=&quot;title&quot; name=&quot;dromouse&quot;&gt;&lt;b&gt;The Dormouse&apos;s story&lt;/b&gt;&lt;/p&gt;
&lt;p class=&quot;story&quot;&gt;Once upon a time there were three little sisters; and their names were
&lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;,
&lt;a class=&quot;sister&quot; href=&quot;http://example.com/lacie&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt; and
&lt;a class=&quot;sister&quot; href=&quot;http://example.com/tillie&quot; id=&quot;link3&quot;&gt;Tillie&lt;/a&gt;;
and they lived at the bottom of a well.&lt;/p&gt;
&lt;p class=&quot;story&quot;&gt;...&lt;/p&gt;
&lt;/body&gt;


&lt;p class=&quot;title&quot; name=&quot;dromouse&quot;&gt;&lt;b&gt;The Dormouse&apos;s story&lt;/b&gt;&lt;/p&gt;
&lt;b&gt;The Dormouse&apos;s story&lt;/b&gt;
The Dormouse&apos;s story


&lt;p class=&quot;story&quot;&gt;Once upon a time there were three little sisters; and their names were
&lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;,
&lt;a class=&quot;sister&quot; href=&quot;http://example.com/lacie&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt; and
&lt;a class=&quot;sister&quot; href=&quot;http://example.com/tillie&quot; id=&quot;link3&quot;&gt;Tillie&lt;/a&gt;;
and they lived at the bottom of a well.&lt;/p&gt;
Once upon a time there were three little sisters; and their names were

&lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;
 Elsie 
,

&lt;a class=&quot;sister&quot; href=&quot;http://example.com/lacie&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt;
Lacie
 and

&lt;a class=&quot;sister&quot; href=&quot;http://example.com/tillie&quot; id=&quot;link3&quot;&gt;Tillie&lt;/a&gt;
Tillie
;
and they lived at the bottom of a well.


&lt;p class=&quot;story&quot;&gt;...&lt;/p&gt;
...
</code></pre>
<h3 id="3-&#x8282;&#x70B9;&#x5185;&#x5BB9;-string-&#x5C5E;&#x6027;">3. &#x8282;&#x70B9;&#x5185;&#x5BB9;: <code>.string</code> &#x5C5E;&#x6027;</h3>
<p>&#x5982;&#x679C;tag&#x53EA;&#x6709;&#x4E00;&#x4E2A; NavigableString &#x7C7B;&#x578B;&#x5B50;&#x8282;&#x70B9;,&#x90A3;&#x4E48;&#x8FD9;&#x4E2A;tag&#x53EF;&#x4EE5;&#x4F7F;&#x7528; .string &#x5F97;&#x5230;&#x5B50;&#x8282;&#x70B9;&#x3002;&#x5982;&#x679C;&#x4E00;&#x4E2A;tag&#x4EC5;&#x6709;&#x4E00;&#x4E2A;&#x5B50;&#x8282;&#x70B9;,&#x90A3;&#x4E48;&#x8FD9;&#x4E2A;tag&#x4E5F;&#x53EF;&#x4EE5;&#x4F7F;&#x7528; .string &#x65B9;&#x6CD5;,&#x8F93;&#x51FA;&#x7ED3;&#x679C;&#x4E0E;&#x5F53;&#x524D;&#x552F;&#x4E00;&#x5B50;&#x8282;&#x70B9;&#x7684; .string &#x7ED3;&#x679C;&#x76F8;&#x540C;&#x3002;</p>
<p>&#x901A;&#x4FD7;&#x70B9;&#x8BF4;&#x5C31;&#x662F;&#xFF1A;&#x5982;&#x679C;&#x4E00;&#x4E2A;&#x6807;&#x7B7E;&#x91CC;&#x9762;&#x6CA1;&#x6709;&#x6807;&#x7B7E;&#x4E86;&#xFF0C;&#x90A3;&#x4E48; .string &#x5C31;&#x4F1A;&#x8FD4;&#x56DE;&#x6807;&#x7B7E;&#x91CC;&#x9762;&#x7684;&#x5185;&#x5BB9;&#x3002;&#x5982;&#x679C;&#x6807;&#x7B7E;&#x91CC;&#x9762;&#x53EA;&#x6709;&#x552F;&#x4E00;&#x7684;&#x4E00;&#x4E2A;&#x6807;&#x7B7E;&#x4E86;&#xFF0C;&#x90A3;&#x4E48; .string &#x4E5F;&#x4F1A;&#x8FD4;&#x56DE;&#x6700;&#x91CC;&#x9762;&#x7684;&#x5185;&#x5BB9;&#x3002;&#x4F8B;&#x5982;&#xFF1A;</p>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.head.string
<span class="hljs-comment">#The Dormouse&apos;s story</span>
<span class="hljs-keyword">print</span> soup.title.string
<span class="hljs-comment">#The Dormouse&apos;s story</span>
</code></pre>
<h2 id="&#x641C;&#x7D22;&#x6587;&#x6863;&#x6811;">&#x641C;&#x7D22;&#x6587;&#x6863;&#x6811;</h2>
<h3 id="1findallname-attrs-recursive-text-kwargs">1.<code>find_all(name, attrs, recursive, text, **kwargs)</code></h3>
<h4 id="1&#xFF09;name-&#x53C2;&#x6570;">1&#xFF09;name &#x53C2;&#x6570;</h4>
<p>name &#x53C2;&#x6570;&#x53EF;&#x4EE5;&#x67E5;&#x627E;&#x6240;&#x6709;&#x540D;&#x5B57;&#x4E3A; name &#x7684;tag,&#x5B57;&#x7B26;&#x4E32;&#x5BF9;&#x8C61;&#x4F1A;&#x88AB;&#x81EA;&#x52A8;&#x5FFD;&#x7565;&#x6389;</p>
<h5 id="a&#x4F20;&#x5B57;&#x7B26;&#x4E32;">A.&#x4F20;&#x5B57;&#x7B26;&#x4E32;</h5>
<p>&#x6700;&#x7B80;&#x5355;&#x7684;&#x8FC7;&#x6EE4;&#x5668;&#x662F;&#x5B57;&#x7B26;&#x4E32;.&#x5728;&#x641C;&#x7D22;&#x65B9;&#x6CD5;&#x4E2D;&#x4F20;&#x5165;&#x4E00;&#x4E2A;&#x5B57;&#x7B26;&#x4E32;&#x53C2;&#x6570;,Beautiful Soup&#x4F1A;&#x67E5;&#x627E;&#x4E0E;&#x5B57;&#x7B26;&#x4E32;&#x5B8C;&#x6574;&#x5339;&#x914D;&#x7684;&#x5185;&#x5BB9;,&#x4E0B;&#x9762;&#x7684;&#x4F8B;&#x5B50;&#x7528;&#x4E8E;&#x67E5;&#x627E;&#x6587;&#x6863;&#x4E2D;&#x6240;&#x6709;&#x7684;<code>&lt;b&gt;</code>&#x6807;&#x7B7E;:</p>
<pre><code class="lang-python">soup.find_all(<span class="hljs-string">&apos;b&apos;</span>)
<span class="hljs-comment"># [&lt;b&gt;The Dormouse&apos;s story&lt;/b&gt;]</span>

<span class="hljs-keyword">print</span> soup.find_all(<span class="hljs-string">&apos;a&apos;</span>)
<span class="hljs-comment">#[&lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;, &lt;a class=&quot;sister&quot; href=&quot;http://example.com/lacie&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt;, &lt;a class=&quot;sister&quot; href=&quot;http://example.com/tillie&quot; id=&quot;link3&quot;&gt;Tillie&lt;/a&gt;]</span>
</code></pre>
<h5 id="b&#x4F20;&#x6B63;&#x5219;&#x8868;&#x8FBE;&#x5F0F;">B.&#x4F20;&#x6B63;&#x5219;&#x8868;&#x8FBE;&#x5F0F;</h5>
<p>&#x5982;&#x679C;&#x4F20;&#x5165;&#x6B63;&#x5219;&#x8868;&#x8FBE;&#x5F0F;&#x4F5C;&#x4E3A;&#x53C2;&#x6570;,Beautiful Soup&#x4F1A;&#x901A;&#x8FC7;&#x6B63;&#x5219;&#x8868;&#x8FBE;&#x5F0F;&#x7684; match() &#x6765;&#x5339;&#x914D;&#x5185;&#x5BB9;.&#x4E0B;&#x9762;&#x4F8B;&#x5B50;&#x4E2D;&#x627E;&#x51FA;&#x6240;&#x6709;&#x4EE5;b&#x5F00;&#x5934;&#x7684;&#x6807;&#x7B7E;,&#x8FD9;&#x8868;&#x793A;<code>&lt;body&gt;</code>&#x548C;<code>&lt;b&gt;</code>&#x6807;&#x7B7E;&#x90FD;&#x5E94;&#x8BE5;&#x88AB;&#x627E;&#x5230;</p>
<pre><code class="lang-python"><span class="hljs-keyword">import</span> re
<span class="hljs-keyword">for</span> tag <span class="hljs-keyword">in</span> soup.find_all(re.compile(<span class="hljs-string">&quot;^b&quot;</span>)):
    print(tag.name)
<span class="hljs-comment"># body</span>
<span class="hljs-comment"># b</span>
</code></pre>
<h5 id="c&#x4F20;&#x5217;&#x8868;">C.&#x4F20;&#x5217;&#x8868;</h5>
<p>&#x5982;&#x679C;&#x4F20;&#x5165;&#x5217;&#x8868;&#x53C2;&#x6570;,Beautiful Soup&#x4F1A;&#x5C06;&#x4E0E;&#x5217;&#x8868;&#x4E2D;&#x4EFB;&#x4E00;&#x5143;&#x7D20;&#x5339;&#x914D;&#x7684;&#x5185;&#x5BB9;&#x8FD4;&#x56DE;.&#x4E0B;&#x9762;&#x4EE3;&#x7801;&#x627E;&#x5230;&#x6587;&#x6863;&#x4E2D;&#x6240;&#x6709;<code>&lt;a&gt;</code>&#x6807;&#x7B7E;&#x548C;<code>&lt;b&gt;</code>&#x6807;&#x7B7E;:</p>
<pre><code class="lang-python">soup.find_all([<span class="hljs-string">&quot;a&quot;</span>, <span class="hljs-string">&quot;b&quot;</span>])
<span class="hljs-comment"># [&lt;b&gt;The Dormouse&apos;s story&lt;/b&gt;,</span>
<span class="hljs-comment">#  &lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;Elsie&lt;/a&gt;,</span>
<span class="hljs-comment">#  &lt;a class=&quot;sister&quot; href=&quot;http://example.com/lacie&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt;,</span>
<span class="hljs-comment">#  &lt;a class=&quot;sister&quot; href=&quot;http://example.com/tillie&quot; id=&quot;link3&quot;&gt;Tillie&lt;/a&gt;]</span>
</code></pre>
<h4 id="2&#xFF09;keyword-&#x53C2;&#x6570;">2&#xFF09;keyword &#x53C2;&#x6570;</h4>
<pre><code class="lang-python">soup.find_all(id=<span class="hljs-string">&apos;link2&apos;</span>)
<span class="hljs-comment"># [&lt;a class=&quot;sister&quot; href=&quot;http://example.com/lacie&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt;]</span>
</code></pre>
<h4 id="3&#xFF09;text-&#x53C2;&#x6570;">3&#xFF09;text &#x53C2;&#x6570;</h4>
<p>&#x901A;&#x8FC7; text &#x53C2;&#x6570;&#x53EF;&#x4EE5;&#x641C;&#x7D22;&#x6587;&#x6863;&#x4E2D;&#x7684;&#x5B57;&#x7B26;&#x4E32;&#x5185;&#x5BB9;&#xFF0C;&#x4E0E; name &#x53C2;&#x6570;&#x7684;&#x53EF;&#x9009;&#x503C;&#x4E00;&#x6837;, text &#x53C2;&#x6570;&#x63A5;&#x53D7; &#x5B57;&#x7B26;&#x4E32; , &#x6B63;&#x5219;&#x8868;&#x8FBE;&#x5F0F; , &#x5217;&#x8868;</p>
<pre><code class="lang-python">soup.find_all(text=<span class="hljs-string">&quot;Elsie&quot;</span>)
<span class="hljs-comment"># [u&apos;Elsie&apos;]</span>

soup.find_all(text=[<span class="hljs-string">&quot;Tillie&quot;</span>, <span class="hljs-string">&quot;Elsie&quot;</span>, <span class="hljs-string">&quot;Lacie&quot;</span>])
<span class="hljs-comment"># [u&apos;Elsie&apos;, u&apos;Lacie&apos;, u&apos;Tillie&apos;]</span>

soup.find_all(text=re.compile(<span class="hljs-string">&quot;Dormouse&quot;</span>))
<span class="hljs-comment"># [u&quot;The Dormouse&apos;s story&quot;, u&quot;The Dormouse&apos;s story&quot;]</span>
</code></pre>
<h2 id="css&#x9009;&#x62E9;&#x5668;">CSS&#x9009;&#x62E9;&#x5668;</h2>
<p>&#x8FD9;&#x5C31;&#x662F;&#x53E6;&#x4E00;&#x79CD;&#x4E0E; find_all &#x65B9;&#x6CD5;&#x6709;&#x5F02;&#x66F2;&#x540C;&#x5DE5;&#x4E4B;&#x5999;&#x7684;&#x67E5;&#x627E;&#x65B9;&#x6CD5;.</p>
<ul>
<li><p>&#x5199; CSS &#x65F6;&#xFF0C;&#x6807;&#x7B7E;&#x540D;&#x4E0D;&#x52A0;&#x4EFB;&#x4F55;&#x4FEE;&#x9970;&#xFF0C;&#x7C7B;&#x540D;&#x524D;&#x52A0;<code>.</code>&#xFF0C;id&#x540D;&#x524D;&#x52A0;<code>#</code></p>
</li>
<li><p>&#x5728;&#x8FD9;&#x91CC;&#x6211;&#x4EEC;&#x4E5F;&#x53EF;&#x4EE5;&#x5229;&#x7528;&#x7C7B;&#x4F3C;&#x7684;&#x65B9;&#x6CD5;&#x6765;&#x7B5B;&#x9009;&#x5143;&#x7D20;&#xFF0C;&#x7528;&#x5230;&#x7684;&#x65B9;&#x6CD5;&#x662F; <code>soup.select()</code>&#xFF0C;&#x8FD4;&#x56DE;&#x7C7B;&#x578B;&#x662F; <code>list</code></p>
</li>
</ul>
<h3 id="&#xFF08;1&#xFF09;&#x901A;&#x8FC7;&#x6807;&#x7B7E;&#x540D;&#x67E5;&#x627E;">&#xFF08;1&#xFF09;&#x901A;&#x8FC7;&#x6807;&#x7B7E;&#x540D;&#x67E5;&#x627E;</h3>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.select(<span class="hljs-string">&apos;title&apos;</span>) 
<span class="hljs-comment">#[&lt;title&gt;The Dormouse&apos;s story&lt;/title&gt;]</span>

<span class="hljs-keyword">print</span> soup.select(<span class="hljs-string">&apos;a&apos;</span>)
<span class="hljs-comment">#[&lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;, &lt;a class=&quot;sister&quot; href=&quot;http://example.com/lacie&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt;, &lt;a class=&quot;sister&quot; href=&quot;http://example.com/tillie&quot; id=&quot;link3&quot;&gt;Tillie&lt;/a&gt;]</span>

<span class="hljs-keyword">print</span> soup.select(<span class="hljs-string">&apos;b&apos;</span>)
<span class="hljs-comment">#[&lt;b&gt;The Dormouse&apos;s story&lt;/b&gt;]</span>
</code></pre>
<h3 id="&#xFF08;2&#xFF09;&#x901A;&#x8FC7;&#x7C7B;&#x540D;&#x67E5;&#x627E;">&#xFF08;2&#xFF09;&#x901A;&#x8FC7;&#x7C7B;&#x540D;&#x67E5;&#x627E;</h3>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.select(<span class="hljs-string">&apos;.sister&apos;</span>)
<span class="hljs-comment">#[&lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;, &lt;a class=&quot;sister&quot; href=&quot;http://example.com/lacie&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt;, &lt;a class=&quot;sister&quot; href=&quot;http://example.com/tillie&quot; id=&quot;link3&quot;&gt;Tillie&lt;/a&gt;]</span>
</code></pre>
<h3 id="&#xFF08;3&#xFF09;&#x901A;&#x8FC7;-id-&#x540D;&#x67E5;&#x627E;">&#xFF08;3&#xFF09;&#x901A;&#x8FC7; id &#x540D;&#x67E5;&#x627E;</h3>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.select(<span class="hljs-string">&apos;#link1&apos;</span>)
<span class="hljs-comment">#[&lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;]</span>
</code></pre>
<h3 id="&#xFF08;4&#xFF09;&#x7EC4;&#x5408;&#x67E5;&#x627E;">&#xFF08;4&#xFF09;&#x7EC4;&#x5408;&#x67E5;&#x627E;</h3>
<p>&#x7EC4;&#x5408;&#x67E5;&#x627E;&#x5373;&#x548C;&#x5199; CSS &#x6587;&#x4EF6;&#x65F6;&#xFF0C;&#x6807;&#x7B7E;&#x540D;&#x4E0E;&#x7C7B;&#x540D;&#x3001;id&#x540D;&#x8FDB;&#x884C;&#x7684;&#x7EC4;&#x5408;&#x539F;&#x7406;&#x662F;&#x4E00;&#x6837;&#x7684;&#xFF0C;&#x4F8B;&#x5982;&#x67E5;&#x627E; p &#x6807;&#x7B7E;&#x4E2D;&#xFF0C;id &#x7B49;&#x4E8E; link1&#x7684;&#x5185;&#x5BB9;&#xFF0C;&#x4E8C;&#x8005;&#x9700;&#x8981;&#x7528;&#x7A7A;&#x683C;&#x5206;&#x5F00;</p>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.select(<span class="hljs-string">&apos;p #link1&apos;</span>)
<span class="hljs-comment">#[&lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;]</span>
</code></pre>
<p>&#x76F4;&#x63A5;&#x5B50;&#x6807;&#x7B7E;&#x67E5;&#x627E;&#xFF0C;&#x5219;&#x4F7F;&#x7528; <code>&gt;</code> &#x5206;&#x9694;</p>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.select(<span class="hljs-string">&quot;head &gt; title&quot;</span>)
<span class="hljs-comment">#[&lt;title&gt;The Dormouse&apos;s story&lt;/title&gt;]</span>
</code></pre>
<h3 id="&#xFF08;5&#xFF09;&#x5C5E;&#x6027;&#x67E5;&#x627E;">&#xFF08;5&#xFF09;&#x5C5E;&#x6027;&#x67E5;&#x627E;</h3>
<p>&#x67E5;&#x627E;&#x65F6;&#x8FD8;&#x53EF;&#x4EE5;&#x52A0;&#x5165;&#x5C5E;&#x6027;&#x5143;&#x7D20;&#xFF0C;&#x5C5E;&#x6027;&#x9700;&#x8981;&#x7528;&#x4E2D;&#x62EC;&#x53F7;&#x62EC;&#x8D77;&#x6765;&#xFF0C;&#x6CE8;&#x610F;&#x5C5E;&#x6027;&#x548C;&#x6807;&#x7B7E;&#x5C5E;&#x4E8E;&#x540C;&#x4E00;&#x8282;&#x70B9;&#xFF0C;&#x6240;&#x4EE5;&#x4E2D;&#x95F4;&#x4E0D;&#x80FD;&#x52A0;&#x7A7A;&#x683C;&#xFF0C;&#x5426;&#x5219;&#x4F1A;&#x65E0;&#x6CD5;&#x5339;&#x914D;&#x5230;&#x3002;</p>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.select(<span class="hljs-string">&apos;a[class=&quot;sister&quot;]&apos;</span>)
<span class="hljs-comment">#[&lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;, &lt;a class=&quot;sister&quot; href=&quot;http://example.com/lacie&quot; id=&quot;link2&quot;&gt;Lacie&lt;/a&gt;, &lt;a class=&quot;sister&quot; href=&quot;http://example.com/tillie&quot; id=&quot;link3&quot;&gt;Tillie&lt;/a&gt;]</span>

<span class="hljs-keyword">print</span> soup.select(<span class="hljs-string">&apos;a[href=&quot;http://example.com/elsie&quot;]&apos;</span>)
<span class="hljs-comment">#[&lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;]</span>
</code></pre>
<p>&#x540C;&#x6837;&#xFF0C;&#x5C5E;&#x6027;&#x4ECD;&#x7136;&#x53EF;&#x4EE5;&#x4E0E;&#x4E0A;&#x8FF0;&#x67E5;&#x627E;&#x65B9;&#x5F0F;&#x7EC4;&#x5408;&#xFF0C;&#x4E0D;&#x5728;&#x540C;&#x4E00;&#x8282;&#x70B9;&#x7684;&#x7A7A;&#x683C;&#x9694;&#x5F00;&#xFF0C;&#x540C;&#x4E00;&#x8282;&#x70B9;&#x7684;&#x4E0D;&#x52A0;&#x7A7A;&#x683C;</p>
<pre><code class="lang-python"><span class="hljs-keyword">print</span> soup.select(<span class="hljs-string">&apos;p a[href=&quot;http://example.com/elsie&quot;]&apos;</span>)
<span class="hljs-comment">#[&lt;a class=&quot;sister&quot; href=&quot;http://example.com/elsie&quot; id=&quot;link1&quot;&gt;&lt;!-- Elsie --&gt;&lt;/a&gt;]</span>
</code></pre>
<h3 id="6-&#x83B7;&#x53D6;&#x5185;&#x5BB9;">&#xFF08;6&#xFF09;&#x83B7;&#x53D6;&#x5185;&#x5BB9;</h3>
<p>&#x4EE5;&#x4E0A;&#x7684; select &#x65B9;&#x6CD5;&#x8FD4;&#x56DE;&#x7684;&#x7ED3;&#x679C;&#x90FD;&#x662F;&#x5217;&#x8868;&#x5F62;&#x5F0F;&#xFF0C;&#x53EF;&#x4EE5;&#x904D;&#x5386;&#x5F62;&#x5F0F;&#x8F93;&#x51FA;&#xFF0C;&#x7136;&#x540E;&#x7528; get_text() &#x65B9;&#x6CD5;&#x6765;&#x83B7;&#x53D6;&#x5B83;&#x7684;&#x5185;&#x5BB9;&#x3002;</p>
<pre><code class="lang-python">soup = BeautifulSoup(html, <span class="hljs-string">&apos;lxml&apos;</span>)
<span class="hljs-keyword">print</span> type(soup.select(<span class="hljs-string">&apos;title&apos;</span>))
<span class="hljs-keyword">print</span> soup.select(<span class="hljs-string">&apos;title&apos;</span>)[<span class="hljs-number">0</span>].get_text()

<span class="hljs-keyword">for</span> title <span class="hljs-keyword">in</span> soup.select(<span class="hljs-string">&apos;title&apos;</span>):
    <span class="hljs-keyword">print</span> title.get_text()
</code></pre>
<footer class="page-footer"><span class="copyright">Copyright &#xA9; BigCat all rights reserved&#xFF0C;powered by GitBook</span><span class="footer-modification">&#x300C;Revision Time:
2016-12-04 14:35:32&#x300D;
</span></footer>
                    
                    </section>
                
                
                </div>
            </div>
        </div>

        
        <a href="../../file/part02/2.4.html" class="navigation navigation-prev " aria-label="Previous page: 案例：使用XPath的爬虫"><i class="fa fa-angle-left"></i></a>
        
        
        <a href="../../file/part02/2.6.html" class="navigation navigation-next " aria-label="Next page: 案例：使用bs4的爬虫"><i class="fa fa-angle-right"></i></a>
        
    </div>
</div>

        
<script src="../../gitbook/app.js"></script>

    
    <script src="../../gitbook/plugins/gitbook-plugin-splitter/splitter.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-toggle-chapters/toggle.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-fontsettings/buttons.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-livereload/plugin.js"></script>
    

<script>
// Page bootstrap: request the "gitbook" module and, once the callback is
// invoked with it, start it with the configuration object defined below.
// NOTE(review): `require` is not defined in this inline script; presumably it
// is provided by one of the scripts loaded above (e.g. gitbook/app.js) — confirm.
require(["gitbook"], function(gitbook) {
    // Settings handed to gitbook.start(). The top-level keys look like plugin
    // names (several match plugin assets this page loads: tbfed-pagefooter,
    // splitter, toggle-chapters, highlight, fontsettings, livereload), each
    // mapping to that plugin's options; empty objects carry no options.
    // TODO confirm against the book configuration that generated this page.
    var config = {"disqus":{"shortName":"gitbookuse"},"github":{"url":"https://github.com/dododream"},"search-pro":{"cutWordLib":"nodejieba","defineWord":["gitbook-use"]},"sharing":{"weibo":true,"facebook":true,"twitter":true,"google":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"tbfed-pagefooter":{"copyright":"Copyright © BigCat","modify_label":"「Revision Time:","modify_format":"YYYY-MM-DD HH:mm:ss」"},"baidu":{"token":"ff100361cdce95dd4c8fb96b4009f7bc"},"sitemap":{"hostname":"http://www.treenewbee.top"},"donate":{"wechat":"http://weixin.png","alipay":"http://alipay.png","title":"","button":"赏","alipayText":"支付宝打赏","wechatText":"微信打赏"},"edit-link":{"base":"https://github.com/dododream/edit","label":"Edit This Page"},"splitter":{},"toggle-chapters":{},"highlight":{},"fontsettings":{"theme":"white","family":"sans","size":2},"livereload":{}};
    // Pass the configuration to the module; what start() does with it is not
    // visible from this file.
    gitbook.start(config);
});
</script>

        <!-- body:end -->
    </body>
    <!-- End of book Python爬虫课程讲义 -->
</html>
